Data anlysis with COVID-19 data from Brazil until May/2021
Hiago W. Petris - 22/05/2021
setwd("C:/Users/hiago/OneDrive/Projetos/Analise-Dados-Covid/")
library(tidyverse) # metapackage of all tidyverse packages
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
library(dplyr)
arq = "C:/Users/hiago/Downloads/caso.csv"
df = read_csv(arq)
##
## -- Column specification --------------------------------------------------------
## cols(
## date = col_date(format = ""),
## state = col_character(),
## city = col_character(),
## place_type = col_character(),
## confirmed = col_double(),
## deaths = col_double(),
## order_for_place = col_double(),
## is_last = col_logical(),
## estimated_population_2019 = col_double(),
## estimated_population = col_double(),
## city_ibge_code = col_double(),
## confirmed_per_100k_inhabitants = col_double(),
## death_rate = col_double()
## )
# View(df)
head(df)
## # A tibble: 6 x 13
## date state city place_type confirmed deaths order_for_place is_last
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <lgl>
## 1 2021-05-21 AP <NA> state 109906 1654 424 TRUE
## 2 2021-05-20 AP <NA> state 109777 1645 423 FALSE
## 3 2021-05-19 AP <NA> state 109625 1635 422 FALSE
## 4 2021-05-18 AP <NA> state 109479 1628 421 FALSE
## 5 2021-05-17 AP <NA> state 109272 1622 420 FALSE
## 6 2021-05-16 AP <NA> state 109070 1615 419 FALSE
## # ... with 5 more variables: estimated_population_2019 <dbl>,
## # estimated_population <dbl>, city_ibge_code <dbl>,
## # confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
str(df)
## spec_tbl_df[,13] [1,970,810 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:1970810], format: "2021-05-21" "2021-05-20" ...
## $ state : chr [1:1970810] "AP" "AP" "AP" "AP" ...
## $ city : chr [1:1970810] NA NA NA NA ...
## $ place_type : chr [1:1970810] "state" "state" "state" "state" ...
## $ confirmed : num [1:1970810] 109906 109777 109625 109479 109272 ...
## $ deaths : num [1:1970810] 1654 1645 1635 1628 1622 ...
## $ order_for_place : num [1:1970810] 424 423 422 421 420 419 418 417 416 415 ...
## $ is_last : logi [1:1970810] TRUE FALSE FALSE FALSE FALSE FALSE ...
## $ estimated_population_2019 : num [1:1970810] 845731 845731 845731 845731 845731 ...
## $ estimated_population : num [1:1970810] 861773 861773 861773 861773 861773 ...
## $ city_ibge_code : num [1:1970810] 16 16 16 16 16 16 16 16 16 16 ...
## $ confirmed_per_100k_inhabitants: num [1:1970810] 12753 12739 12721 12704 12680 ...
## $ death_rate : num [1:1970810] 0.015 0.015 0.0149 0.0149 0.0148 0.0148 0.0148 0.0148 0.0148 0.0148 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. state = col_character(),
## .. city = col_character(),
## .. place_type = col_character(),
## .. confirmed = col_double(),
## .. deaths = col_double(),
## .. order_for_place = col_double(),
## .. is_last = col_logical(),
## .. estimated_population_2019 = col_double(),
## .. estimated_population = col_double(),
## .. city_ibge_code = col_double(),
## .. confirmed_per_100k_inhabitants = col_double(),
## .. death_rate = col_double()
## .. )
First part: Only data from state of Paraná
dfPR = df %>%
arrange(date) %>%
filter(state=='PR', place_type=='state')
head(dfPR)
## # A tibble: 6 x 13
## date state city place_type confirmed deaths order_for_place is_last
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <lgl>
## 1 2020-03-12 PR <NA> state 6 0 1 FALSE
## 2 2020-03-13 PR <NA> state 6 0 2 FALSE
## 3 2020-03-16 PR <NA> state 6 0 3 FALSE
## 4 2020-03-17 PR <NA> state 12 0 4 FALSE
## 5 2020-03-18 PR <NA> state 14 0 5 FALSE
## 6 2020-03-19 PR <NA> state 23 0 6 FALSE
## # ... with 5 more variables: estimated_population_2019 <dbl>,
## # estimated_population <dbl>, city_ibge_code <dbl>,
## # confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
nrow(dfPR)
## [1] 434
#View(dfPR)
Cumulative cases at Paraná
figCumulativeCases <- plot_ly(dfPR,x=~date,y=~confirmed,type='scatter',name='Cumulative cases',mode='lines')
figCumulativeCases <- figCumulativeCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='Cumulative cases'))
figCumulativeCases <- figCumulativeCases %>% config(locale="pt-br")
figCumulativeCases
Cumulative cases and deaths
figCumulativeCasesAndDeaths <- figCumulativeCases %>% add_trace(y=~deaths,name='Cumulative deaths',mode='lines')
figCumulativeCasesAndDeaths
Death rate
figDeathRate <- plot_ly(dfPR,x=~date,y=~death_rate,type='scatter',name='Death rate',mode='lines')
figDeathRate <- figDeathRate %>% layout(xaxis=list(title='Date'),yaxis=list(title='Death rate'))
figDeathRate
New cases per day
# ?dplyr::lag
dfPRNewCases = dfPR
dfPRNewCases = dfPRNewCases %>%
mutate(new_cases = confirmed-lag(confirmed), .after=confirmed)
dfPRNewCases$new_cases[1] = dfPRNewCases$confirmed[1]
head(dfPRNewCases)
## # A tibble: 6 x 14
## date state city place_type confirmed new_cases deaths order_for_place
## <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2020-03-12 PR <NA> state 6 6 0 1
## 2 2020-03-13 PR <NA> state 6 0 0 2
## 3 2020-03-16 PR <NA> state 6 0 0 3
## 4 2020-03-17 PR <NA> state 12 6 0 4
## 5 2020-03-18 PR <NA> state 14 2 0 5
## 6 2020-03-19 PR <NA> state 23 9 0 6
## # ... with 6 more variables: is_last <lgl>, estimated_population_2019 <dbl>,
## # estimated_population <dbl>, city_ibge_code <dbl>,
## # confirmed_per_100k_inhabitants <dbl>, death_rate <dbl>
#View(dfPRNewCases)
New cases
figNewCases <- plot_ly(dfPRNewCases,x=~date,y=~new_cases,type='scatter',name='New Cases',mode='lines')
figNewCases <- figNewCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='New Cases'))
figNewCases <- figNewCases %>% config(locale="pt-br")
figNewCases
Analysis of the impact of a restriction measure, getting 14 days before and 14 days after the measure
- Restriction measure on 31/03
- Relax of the restriction measure on 05/04
dfPRRestrictionMeasure = dfPRNewCases %>%
filter(date>=as.Date('2021-03-31')-14, date<=as.Date('2021-04-05')+14) %>%
select(date, new_cases, death_rate)
head(dfPRRestrictionMeasure)
## # A tibble: 6 x 3
## date new_cases death_rate
## <date> <dbl> <dbl>
## 1 2021-03-17 5461 0.0183
## 2 2021-03-18 6469 0.0184
## 3 2021-03-19 8146 0.0186
## 4 2021-03-20 4758 0.0187
## 5 2021-03-21 2146 0.0187
## 6 2021-03-22 3224 0.0187
# View(dfPRRestrictionMeasure)
figRestrictionMeasure <- plot_ly(dfPRRestrictionMeasure,x=~date,y=~new_cases,type='scatter',name='New Cases',mode='lines')
figRestrictionMeasure <- figRestrictionMeasure %>% layout(xaxis=list(title='Date'),yaxis=list(title='New Cases'))
figRestrictionMeasure <- figRestrictionMeasure %>% config(locale="pt-br")
figRestrictionMeasure <- figRestrictionMeasure %>% add_segments(
name="Restriction Measure", line=list(color="orange"),
x=as.Date('2021-03-31'), xend=as.Date('2021-03-31'),
y=min(dfPRRestrictionMeasure$new_cases), yend=max(dfPRRestrictionMeasure$new_cases))
figRestrictionMeasure <- figRestrictionMeasure %>% layout(
shapes=list(
type="rect",fillcolor="orange", line=list(color='orange'), opacity=0.3,
x0=as.Date('2021-03-31'), x1=as.Date('2021-04-05'),
y0=min(dfPRRestrictionMeasure$new_cases),y1=max(dfPRRestrictionMeasure$new_cases)
)
)
figRestrictionMeasure <- figRestrictionMeasure %>% add_segments(
name="Relax of restriction measure", line=list(color="green"),
x=as.Date('2021-04-05'), xend=as.Date('2021-04-05'),
y=min(dfPRRestrictionMeasure$new_cases),yend=max(dfPRRestrictionMeasure$new_cases))
figRestrictionMeasure
Same analysis with Death reate
figRestrictionMeasure2 <- plot_ly(dfPRRestrictionMeasure,x=~date,y=~death_rate,type='scatter',name='Death rate',mode='lines')
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% layout(xaxis=list(title='Date'),yaxis=list(title='Death rate'))
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% config(locale="pt-br")
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% add_segments(
name="Medida de restrição", line=list(color="orange"),
x=as.Date('2021-03-31'), xend=as.Date('2021-03-31'),
y=min(dfPRRestrictionMeasure$death_rate),yend=max(dfPRRestrictionMeasure$death_rate))
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% layout(
shapes=list(
type="rect",fillcolor="orange", line=list(color='orange'), opacity=0.3,
x0=as.Date('2021-03-31'), x1=as.Date('2021-04-05'),
y0=min(dfPRRestrictionMeasure$death_rate),y1=max(dfPRRestrictionMeasure$death_rate)
)
)
figRestrictionMeasure2 <- figRestrictionMeasure2 %>% add_segments(name="Relax of restriction measure", line=list(color="green"), x=as.Date('2021-04-05'), xend=as.Date('2021-04-05'),y=min(dfPRRestrictionMeasure$death_rate),yend=max(dfPRRestrictionMeasure$death_rate))
figRestrictionMeasure2
Contamination rate
- Calculate contamination rate = new cases / new cases from a day before
- Change the scale of Y axis or apply some normalization could help in visualization, beacause it starts on 0 and than has a peak at 2, but then it keeps approximately at 1
dfPRContaminationRate = dfPRNewCases %>%
mutate(contamination_rate = round(confirmed/lag(confirmed),2), .after=new_cases) %>%
select(date,confirmed,contamination_rate)
dfPRContaminationRate$contamination_rate[1] = 0
head(dfPRContaminationRate)
## # A tibble: 6 x 3
## date confirmed contamination_rate
## <date> <dbl> <dbl>
## 1 2020-03-12 6 0
## 2 2020-03-13 6 1
## 3 2020-03-16 6 1
## 4 2020-03-17 12 2
## 5 2020-03-18 14 1.17
## 6 2020-03-19 23 1.64
# View(dfPRContaminationRate)
figContaminationRate <- plot_ly(dfPRContaminationRate,x=~date,y=~contamination_rate,type='scatter',name='Contamination Rate',mode='lines')
figContaminationRate <- figDeathRate %>% layout(xaxis=list(title='Date'),yaxis=list(title='Contamination Rate'))
figContaminationRate <- figDeathRate %>% config(locale="pt-br")
figContaminationRate
Deaths / New Cases
- Filter because it has and outlier
dfPRDeathsPerNewcases = dfPRNewCases %>%
mutate(deaths_per_new_cases = ifelse(new_cases==0, deaths, round(deaths/new_cases,2)), .after=new_cases) %>%
select(date,deaths,new_cases,deaths_per_new_cases) %>%
filter(deaths_per_new_cases < 1000)
head(dfPRDeathsPerNewcases)
## # A tibble: 6 x 4
## date deaths new_cases deaths_per_new_cases
## <date> <dbl> <dbl> <dbl>
## 1 2020-03-12 0 6 0
## 2 2020-03-13 0 0 0
## 3 2020-03-16 0 0 0
## 4 2020-03-17 0 6 0
## 5 2020-03-18 0 2 0
## 6 2020-03-19 0 9 0
figDeathsNewCases <- plot_ly(dfPRDeathsPerNewcases,x=~date,y=~deaths_per_new_cases,type='scatter',name='Deaths/New Cases',mode='lines')
figDeathsNewCases <- figDeathsNewCases %>% layout(xaxis=list(title='Date'),yaxis=list(title='Deaths/New Cases'))
figDeathsNewCases <- figDeathsNewCases %>% config(locale="pt-br")
figDeathsNewCases
# TODO: LM new cases per death
#ggplot(dfPRNewCases,aes(x=estimated_population,y=contamination_rate)) +
# geom_point() +
#stat_smooth(method="lm", col="red")
Data from Brasil Cities
# Creates dataframe with contamination rate and estimated population, only for cities
dfContaminationRatePopulation = df %>%
filter(place_type=='city') %>%
select(date,city,state,estimated_population,confirmed) %>%
arrange(city,state,date) %>%
mutate(contamination_rate = confirmed/lag(confirmed), .before=confirmed)
# Removes column confirmed.
# dfContaminationRatePopulation$confirmed = NULL
# Removes NA values
dfContaminationRatePopulation = dfContaminationRatePopulation %>%
filter(!is.na(contamination_rate) & !is.na(estimated_population))
# Calculates the mean of contamination rate for every city and the max value of estimated population
dfContaminationRatePopulation2 = dfContaminationRatePopulation %>%
group_by(city,state) %>%
summarise(meanContaminationRate = mean(contamination_rate), estimated_population = max(estimated_population)) %>%
filter(!is.infinite(meanContaminationRate))
## `summarise()` has grouped output by 'city'. You can override using the `.groups` argument.
head(dfContaminationRatePopulation2)
## # A tibble: 6 x 4
## # Groups: city [6]
## city state meanContaminationRate estimated_population
## <chr> <chr> <dbl> <dbl>
## 1 Abadia de Goiás GO 1.02 8958
## 2 Abadia dos Dourados MG 1.02 7006
## 3 Abadiânia GO 1.02 20461
## 4 Abaeté MG 1.02 23250
## 5 Abaetetuba PA 1.03 159080
## 6 Abaíra BA 1.02 8710
ggplot(dfContaminationRatePopulation2, aes(x=meanContaminationRate,y=estimated_population)) +
geom_point()

# There are cities with low population and high contamination rate
# But no city with high population has contamination rate > 1.05
# Check for Cities with higher population (Sao Paulo and Rio) or higher contamination rate
dfContaminationRatePopulation2 %>%
filter(estimated_population > 4.0e+06 | meanContaminationRate > 1.5)
## # A tibble: 4 x 4
## # Groups: city [4]
## city state meanContaminationRate estimated_population
## <chr> <chr> <dbl> <dbl>
## 1 Rio de Janeiro RJ 1.04 6747815
## 2 Santa Cruz do Sul RS 1.64 131365
## 3 Santa Margarida do Sul RS 2.55 2578
## 4 São Paulo SP 1.04 12325232
# Filter for "Santa Margarida do Sul" in the original dataframe
View(df %>% filter(city=='Santa Margarida do Sul') %>% arrange(date) %>% filter(confirmed>23000))
# Apparently it has an error on day 2021-04-04, with 23326 confirmed cases
Analyse distribution of the contamination rate means
# Filter by meanContaminationRete < 1.05 to facilitate visualization
dfContaminationRatePopulation3 = dfContaminationRatePopulation2 %>%
filter(meanContaminationRate < 1.05) %>%
arrange(meanContaminationRate)
mean(dfContaminationRatePopulation3$meanContaminationRate)
## [1] 1.020475
median(dfContaminationRatePopulation3$meanContaminationRate)
## [1] 1.019825
quantile(dfContaminationRatePopulation3$meanContaminationRate)
## 0% 25% 50% 75% 100%
## 1.002164 1.016944 1.019825 1.023171 1.049820
ggplot(dfContaminationRatePopulation3, aes(x=meanContaminationRate)) + geom_density()
